In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [2]:
# Show which files sit next to the notebook (with no argument, listdir
# lists the current working directory).
os.listdir()


Out[2]:
['.ipynb_checkpoints',
 'gender_submission.csv',
 'test.csv',
 'train.csv',
 '[Kaggle] Titanic.ipynb']

In [3]:
# Read the two CSV files found in the working directory into DataFrames.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [4]:
# Peek at the first five rows of the training frame.
train.iloc[:5]


Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [5]:
# First look: summary statistics of the numeric columns, followed by the
# dtype of every column, each printed as plain text on its own block.
print(train.describe(), train.dtypes, sep='\n')


       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
# Count missing values per column -- there are quite a few of them.
pd.isnull(train).sum()


Out[6]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Cleaning


In [7]:
# Begin the cleaning with the Sex column: how many rows per category?
train['Sex'].value_counts()


Out[7]:
male      577
female    314
Name: Sex, dtype: int64

In [8]:
# convert sex to 1 (male) and 0 (female)
def sexconverter(row):
    """Encode a passenger's sex as an integer: 1 for male, 0 otherwise.

    The function is safe to apply to a column that has already been
    encoded: an existing 1 stays 1 and an existing 0 stays 0, so re-running
    the conversion cell does not wipe out the male labels.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``'Sex'`` (for example a DataFrame row when
        used with ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int
        1 if the value is the string ``'male'`` or is already equal to 1;
        0 for every other value.
    """
    sex = row['Sex']
    if isinstance(sex, str):
        # Raw label straight from the CSV.
        return 1 if sex == 'male' else 0
    # Non-string value: the column was already encoded by an earlier run.
    # Keep 1 as 1; anything else falls back to 0 as before.
    return 1 if sex == 1 else 0
# Overwrite the Sex column with the value sexconverter returns for each row.
train['Sex'] = train.apply(sexconverter, axis='columns')

In [9]:
# Passengers younger than one year -- there are only 7 of them.
train.loc[train['Age'] < 1]


Out[9]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
78 79 1 2 Caldwell, Master. Alden Gates 1 0.83 0 2 248738 29.0000 NaN S
305 306 1 1 Allison, Master. Hudson Trevor 1 0.92 1 2 113781 151.5500 C22 C26 S
469 470 1 3 Baclini, Miss. Helene Barbara 0 0.75 2 1 2666 19.2583 NaN C
644 645 1 3 Baclini, Miss. Eugenie 0 0.75 2 1 2666 19.2583 NaN C
755 756 1 2 Hamalainen, Master. Viljo 1 0.67 1 1 250649 14.5000 NaN S
803 804 1 3 Thomas, Master. Assad Alexander 1 0.42 0 1 2625 8.5167 NaN C
831 832 1 2 Richards, Master. George Sibley 1 0.83 1 1 29106 18.7500 NaN S

In [10]:
# Distribution of ticket fares:
# significant non-normality and right skewness.
fig, ax = plt.subplots(figsize=(10, 10))
sns.distplot(train['Fare'], ax=ax)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x21dd7855908>

In [11]:
# Fare against survival, coloured by sex.
# Already visible here: those who did not survive were mainly men.
fig, ax = plt.subplots(figsize=(10, 10))
sns.swarmplot(data=train, x='Survived', y='Fare', hue='Sex', ax=ax)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x21dd788d7f0>

In [12]:
# Survival counts by sex, with row and column totals in the margins.
tmp = pd.crosstab(train['Sex'], train['Survived'], margins=True)
tmp


Out[12]:
Survived 0 1 All
Sex
0 81 233 314
1 468 109 577
All 549 342 891

In [13]:
# The same table as proportions, normalised within each row (each sex).
tmp_freq = pd.crosstab(train['Sex'], train['Survived'], margins=True, normalize="index")
tmp_freq


Out[13]:
Survived 0 1
Sex
0 0.257962 0.742038
1 0.811092 0.188908
All 0.616162 0.383838

In [14]:
# Passenger class against (survival, sex) counts, drawn as a heatmap:
# it is mostly lower-class passengers who did not survive.
fig, ax = plt.subplots(figsize=(10, 10))
tmp = pd.crosstab(train['Pclass'], [train['Survived'], train['Sex']])
sns.heatmap(tmp, cmap="plasma", ax=ax)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x21dd8000940>

In [15]:
# Age against survival: a violin plot with the individual passengers
# overlaid as a semi-transparent swarm coloured by sex.
fig, ax = plt.subplots(figsize=(10, 10))
sns.violinplot(data=train, x="Survived", y="Age", ax=ax)
sns.swarmplot(data=train, x="Survived", y="Age", hue="Sex", alpha=0.5, ax=ax)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x21dd7d9cef0>

Modeling

Simple Model: Gaussian Naive Bayes


In [34]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import numpy as np
from sklearn.model_selection import train_test_split

In [17]:
# Gaussian Naive Bayes classifier; priors=None is the default, spelled out.
clf = GaussianNB(priors=None)

In [ ]:
# Impute missing Age values with the mean of the observed ages.
# The mean is computed once (Series.mean skips NaN by default) instead of
# once per row, and fillna leaves existing values untouched, so re-running
# this cell changes nothing.
# NOTE(review): the mean is taken over the whole training frame before the
# train/test split further down, so held-out rows influence the imputed value.
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [29]:
# Feature matrix (sex, age, passenger class) and target as NumPy arrays.
# The target is selected with a list of columns, so y is 2-D with one column.
X = train.loc[:, ['Sex', 'Age', 'Pclass']].values
y = train.loc[:, ['Survived']].values

In [30]:
# Hold out 20% of the rows for evaluation. The shuffle is seeded so the
# accuracy and ROC figures computed afterwards are reproducible when the
# notebook is restarted and run from the top.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Fit on the training split; the column-shaped target is flattened to 1-D.
clf.fit(X=X_train, y=y_train.reshape(-1))


Out[32]:
GaussianNB(priors=None)

In [40]:
# Accuracy of the fitted classifier on the held-out split, as a percentage.
y_pred = clf.predict(X_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of GNB model is %.2f%%" % (acc * 100))


Accuracy of GNB model is 75.42%

In [41]:
# ROC inputs: score each held-out row with the probability from the last
# column of predict_proba, then compute the curve and the area under it.
probs = clf.predict_proba(X_test)
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(y_true=y_test, y_score=preds)
roc_auc = metrics.auc(x=fpr, y=tpr)

In [49]:
# Draw the ROC curve (blue) against the dashed red diagonal reference line.
axis_font = {'fontname':'Arial', 'size':'22'}
fig, ax = plt.subplots(figsize=(15, 15))
ax.plot(fpr, tpr, 'b', label="ROC curve(area=%0.2f)" % roc_auc)
ax.plot([0, 1], [0, 1], "r--")
ax.set_xlabel("False Positive Rate", **axis_font)
ax.set_ylabel("True Positive Rate", **axis_font)
ax.legend(loc="lower right")


Out[49]:
<matplotlib.legend.Legend at 0x21dd995cf98>